{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cleanlab\n",
    "import numpy as np\n",
    "from cleanlab.models.fasttext import data_loader\n",
    "import pandas as pd\n",
    "from IPython.display import display, HTML\n",
    "pd.options.display.max_colwidth = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the stored predicted probabilities (pyx).\n",
    "pyx_dir = '/datasets/cgn/pyx/amazon/'\n",
    "# File to load; the settings of the run are spelled out in its name.\n",
    "pyx_file = 'amazon_pyx_cv__folds_3__epochs_10__lr_1.0__ngram_3__dim_100.npy'\n",
    "\n",
    "# Load pyx (np.load accepts a path and takes care of opening the file).\n",
    "fn = pyx_dir + pyx_file\n",
    "pyx = np.load(fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetched probabilities for 9996437 examples and 3 classes.\n"
     ]
    }
   ],
   "source": [
    "# Report how many rows (examples) and columns (classes) pyx has.\n",
    "print('Fetched probabilities for {} examples and {} classes.'.format(pyx.shape[0], pyx.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get data: the given labels `s` (as class indices) and the review text.\n",
    "loc = '/datasets/datasets/amazon5core/amazon5core.txt'\n",
    "bs = 1000000\n",
    "label_map = {'__label__1':0, '__label__3':1, '__label__5':2}\n",
    "s = np.empty(pyx.shape[0], dtype=int)\n",
    "text = []\n",
    "n_loaded = 0\n",
    "for i, (labels, reviews) in enumerate(data_loader(loc, batch_size=bs)):\n",
    "    # Batch i fills rows [bs*i, bs*(i+1)); assumes every batch but the last is full.\n",
    "    s[bs*i:bs*(i+1)] = [label_map[lab] for lab in labels]\n",
    "    text.extend(reviews)  # keeps `text` flat, no second flattening pass needed\n",
    "    n_loaded += len(labels)\n",
    "# np.empty leaves unwritten rows uninitialised, so check that every row was filled\n",
    "# and that labels and reviews stay aligned index-for-index.\n",
    "assert n_loaded == len(s) == len(text), (n_loaded, len(s), len(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cross-val accuracy: 90.44%\n"
     ]
    }
   ],
   "source": [
    "# Fraction of examples whose most probable class equals the given label.\n",
    "# np.mean stays vectorized; the built-in sum() would visit every element in Python.\n",
    "crossval_acc = np.mean(pyx.argmax(axis=1) == s)\n",
    "print('Cross-val accuracy: {:.2%}'.format(crossval_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimate the confident joint, a proxy for the joint distribution of label noise.\n",
    "cj, cj_only_label_error_indices = cleanlab.latent_estimation.compute_confident_joint(\n",
    "    s, pyx,\n",
    "    return_indices_of_off_diagonals=True,\n",
    ")\n",
    "py, nm, inv = cleanlab.latent_estimation.estimate_latent(cj, s)\n",
    "\n",
    "# If you want to get label errors using cj_only method.\n",
    "# The returned indices are turned into a boolean mask over all examples with a\n",
    "# single indexed assignment instead of a Python loop.\n",
    "cj_only_bool_mask = np.zeros(len(s), dtype=bool)\n",
    "cj_only_bool_mask[cj_only_label_error_indices] = True\n",
    "# NOTE: the get_noise_indices cell further down rebinds label_errors_idx, so in a\n",
    "# top-to-bottom run this cj-only ordering is replaced before it is displayed.\n",
    "label_errors_idx = cleanlab.pruning.order_label_errors(\n",
    "    cj_only_bool_mask, pyx, s,\n",
    "    sorted_index_method='normalized_margin',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_errors(\n",
    "    label_errors_idx,\n",
    "    latex=False,\n",
    "    num_to_examine=200,\n",
    "    num_to_view=10,\n",
    "):\n",
    "    \"\"\"Show short reviews flagged as label errors, optionally as LaTeX.\n",
    "\n",
    "    Relies on the notebook globals `s` (given labels), `text` (reviews)\n",
    "    and `pyx` (predicted probabilities).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    label_errors_idx : iterable of int\n",
    "        Indices into `s`, `text` and `pyx`, visited in the given order.\n",
    "    latex : bool\n",
    "        If True, also print the table as LaTeX with star symbols.\n",
    "    num_to_examine : int\n",
    "        Maximum number of leading indices to inspect.\n",
    "    num_to_view : int\n",
    "        Maximum number of rows to display.\n",
    "    \"\"\"\n",
    "    stars = [1, 3, 5]  # class index -> star rating, mirroring label_map\n",
    "    columns = ['Review', 'Given Label', 'CL Guess']\n",
    "    results = []\n",
    "    for i, idx in enumerate(label_errors_idx):\n",
    "        if i >= num_to_examine:\n",
    "            break  # Inspect exactly num_to_examine candidates.\n",
    "        review = text[idx]\n",
    "        if 'sex' in review or 'serrated' in review:\n",
    "            continue  # Don't add profanity to our paper.\n",
    "        if 30 < len(review) < 120:\n",
    "            # The 'cgn' suffix is a placeholder that the LaTeX step below\n",
    "            # swaps for star symbols.\n",
    "            results.append({\n",
    "                'Review': review,\n",
    "                'Given Label': str(stars[s[idx]]) + 'cgn',\n",
    "                'CL Guess': str(stars[np.argmax(pyx[idx])]) + 'cgn',\n",
    "            })\n",
    "\n",
    "    # Passing the columns explicitly keeps set_index working when nothing matched.\n",
    "    df = pd.DataFrame(results[:num_to_view], columns=columns)\n",
    "    display(df.set_index('Review', drop=True))\n",
    "    if latex:\n",
    "        # Raw strings: '\\s' is not a valid escape sequence in a normal literal.\n",
    "        tex = df.to_latex(index=False).replace('5cgn', r'$\\star\\star\\star\\star\\star$')\n",
    "        tex = tex.replace('3cgn', r'$\\star\\star\\star$')\n",
    "        tex = tex.replace('1cgn', r'$\\star$')\n",
    "        print(tex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the label errors with confident learning, reusing the confident joint `cj`.\n",
    "label_errors_idx = cleanlab.pruning.get_noise_indices(\n",
    "    s=s, psx=pyx, confident_joint=cj,\n",
    "    # sorted_index_method options: ['prob_given_label', 'normalized_margin']\n",
    "    sorted_index_method='normalized_margin',\n",
    "    prune_method='both',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Given Label</th>\n",
       "      <th>CL Guess</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Review</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A very good addition to kindle. Cleans and scans. Very easy  TO USE</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Buy it and enjoy a great story.</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Works great! I highly recommend it to everyone that enjoys singing hymns! Love it! Love it! Love it! :) .</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Awesome it was better than all the other my weirder school books.  I love it! The best book ever.Awesome</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I gave this 5 stars under duress.  I would rather give it 3 stars.  it plays fine but it is a little boring so far.</th>\n",
       "      <td>5cgn</td>\n",
       "      <td>3cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>only six words: don't waist your money on this</th>\n",
       "      <td>5cgn</td>\n",
       "      <td>1cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I love it so much at first I though it would be boring but turns out its fun for all ages get it</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Excellent read, could not put it down! Keep up the great works ms. Brown. Cannot wait to download the next one.</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>This is one of the easiest to use games I have ever played. It is adaptable and fun. I love it.</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>So this is what today's music has become?</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sarah and Charlie, what a wonderful story. I loved this book and look forward to reading more of this series.</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I've had this for over a year and it works very well.  I am very happy with this purchase.</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>this show is insane and I love it. I will be ordering more seasons of it.</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Just what the world needs, more generic r&amp;b.</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>I did like the Making Of This Is movie it  okay it not the best okay it not great  .</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>3cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tough game. But of course it has the very best sound track ever!</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unexpected kid on the way thanks to this shit</th>\n",
       "      <td>1cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>The kids are fascinated by it, Plus my wife loves it.. I love it I love it we love it</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Loved this book! A great story and insight into the time period and life during those times. Highly recommend this book</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Great reading I could not put it down. Highly recommend reading this book. You will not be disappointed. Must read.</th>\n",
       "      <td>3cgn</td>\n",
       "      <td>5cgn</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                        Given Label  \\\n",
       "Review                                                                                                                                \n",
       "A very good addition to kindle. Cleans and scans. Very easy  TO USE                                                            1cgn   \n",
       "Buy it and enjoy a great story.                                                                                                3cgn   \n",
       "Works great! I highly recommend it to everyone that enjoys singing hymns! Love it! Love it! Love it! :) .                      3cgn   \n",
       "Awesome it was better than all the other my weirder school books.  I love it! The best book ever.Awesome                       1cgn   \n",
       "I gave this 5 stars under duress.  I would rather give it 3 stars.  it plays fine but it is a little boring so far.            5cgn   \n",
       "only six words: don't waist your money on this                                                                                 5cgn   \n",
       "I love it so much at first I though it would be boring but turns out its fun for all ages get it                               1cgn   \n",
       "Excellent read, could not put it down! Keep up the great works ms. Brown. Cannot wait to download the next one.                1cgn   \n",
       "This is one of the easiest to use games I have ever played. It is adaptable and fun. I love it.                                1cgn   \n",
       "So this is what today's music has become?                                                                                      1cgn   \n",
       "Sarah and Charlie, what a wonderful story. I loved this book and look forward to reading more of this series.                  3cgn   \n",
       "I've had this for over a year and it works very well.  I am very happy with this purchase.                                     1cgn   \n",
       "this show is insane and I love it. I will be ordering more seasons of it.                                                      3cgn   \n",
       "Just what the world needs, more generic r&b.                                                                                   1cgn   \n",
       "I did like the Making Of This Is movie it  okay it not the best okay it not great  .                                           1cgn   \n",
       "Tough game. But of course it has the very best sound track ever!                                                               1cgn   \n",
       "unexpected kid on the way thanks to this shit                                                                                  1cgn   \n",
       "The kids are fascinated by it, Plus my wife loves it.. I love it I love it we love it                                          3cgn   \n",
       "Loved this book! A great story and insight into the time period and life during those times. Highly recommend this book        3cgn   \n",
       "Great reading I could not put it down. Highly recommend reading this book. You will not be disappointed. Must read.            3cgn   \n",
       "\n",
       "                                                                                                                        CL Guess  \n",
       "Review                                                                                                                            \n",
       "A very good addition to kindle. Cleans and scans. Very easy  TO USE                                                         5cgn  \n",
       "Buy it and enjoy a great story.                                                                                             5cgn  \n",
       "Works great! I highly recommend it to everyone that enjoys singing hymns! Love it! Love it! Love it! :) .                   5cgn  \n",
       "Awesome it was better than all the other my weirder school books.  I love it! The best book ever.Awesome                    5cgn  \n",
       "I gave this 5 stars under duress.  I would rather give it 3 stars.  it plays fine but it is a little boring so far.         3cgn  \n",
       "only six words: don't waist your money on this                                                                              1cgn  \n",
       "I love it so much at first I though it would be boring but turns out its fun for all ages get it                            5cgn  \n",
       "Excellent read, could not put it down! Keep up the great works ms. Brown. Cannot wait to download the next one.             5cgn  \n",
       "This is one of the easiest to use games I have ever played. It is adaptable and fun. I love it.                             5cgn  \n",
       "So this is what today's music has become?                                                                                   5cgn  \n",
       "Sarah and Charlie, what a wonderful story. I loved this book and look forward to reading more of this series.               5cgn  \n",
       "I've had this for over a year and it works very well.  I am very happy with this purchase.                                  5cgn  \n",
       "this show is insane and I love it. I will be ordering more seasons of it.                                                   5cgn  \n",
       "Just what the world needs, more generic r&b.                                                                                5cgn  \n",
       "I did like the Making Of This Is movie it  okay it not the best okay it not great  .                                        3cgn  \n",
       "Tough game. But of course it has the very best sound track ever!                                                            5cgn  \n",
       "unexpected kid on the way thanks to this shit                                                                               5cgn  \n",
       "The kids are fascinated by it, Plus my wife loves it.. I love it I love it we love it                                       5cgn  \n",
       "Loved this book! A great story and insight into the time period and life during those times. Highly recommend this book     5cgn  \n",
       "Great reading I could not put it down. Highly recommend reading this book. You will not be disappointed. Must read.         5cgn  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lll}\n",
      "\\toprule\n",
      "                                                                                                                  Review & Given Label & CL Guess \\\\\n",
      "\\midrule\n",
      "                                                     A very good addition to kindle. Cleans and scans. Very easy  TO USE &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                                                                         Buy it and enjoy a great story. &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "               Works great! I highly recommend it to everyone that enjoys singing hymns! Love it! Love it! Love it! :) . &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                Awesome it was better than all the other my weirder school books.  I love it! The best book ever.Awesome &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "     I gave this 5 stars under duress.  I would rather give it 3 stars.  it plays fine but it is a little boring so far. &        $\\star\\star\\star\\star\\star$ &     $\\star\\star\\star$ \\\\\n",
      "                                                                          only six words: don't waist your money on this &        $\\star\\star\\star\\star\\star$ &     $\\star$ \\\\\n",
      "                        I love it so much at first I though it would be boring but turns out its fun for all ages get it &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "         Excellent read, could not put it down! Keep up the great works ms. Brown. Cannot wait to download the next one. &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                         This is one of the easiest to use games I have ever played. It is adaptable and fun. I love it. &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                                                               So this is what today's music has become? &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "           Sarah and Charlie, what a wonderful story. I loved this book and look forward to reading more of this series. &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                              I've had this for over a year and it works very well.  I am very happy with this purchase. &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                               this show is insane and I love it. I will be ordering more seasons of it. &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                                                            Just what the world needs, more generic r\\&b. &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                    I did like the Making Of This Is movie it  okay it not the best okay it not great  . &        $\\star$ &     $\\star\\star\\star$ \\\\\n",
      "                                                        Tough game. But of course it has the very best sound track ever! &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                                                           unexpected kid on the way thanks to this shit &        $\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "                                   The kids are fascinated by it, Plus my wife loves it.. I love it I love it we love it &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      " Loved this book! A great story and insight into the time period and life during those times. Highly recommend this book &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "     Great reading I could not put it down. Highly recommend reading this book. You will not be disappointed. Must read. &        $\\star\\star\\star$ &     $\\star\\star\\star\\star\\star$ \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show up to 20 flagged reviews and also print them as a LaTeX table.\n",
    "print_errors(label_errors_idx=label_errors_idx, num_to_view=20, latex=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
